Chapter 2 – Small Worlds and Large Worlds

Author

Danton Noriega-Goodwin

Published

January 25, 2024

Code
# set-up
using Plots
using Colors
using StatsPlots
using StatsBase
using DataFrames
using Distributions
using SpecialFunctions

default(
  labels=false, fontfamily="sans-serif", titlefontsize=10, labelfontsize=8,
  legendfontsize=5, color=:steelblue2, fillcolor=:steelblue2)

Code 2.1

samp = ["W","L","W","W","W","L","W","L","W"]
W = sum(samp .== "W")
L = sum(samp .== "L")
p = collect(0:0.25:1) # proportions W
ways = map( x -> (x*4)^W * ((1-x)*4)^L, p)
prob = ways ./ sum(ways)
hcat(p, ways, prob)
5×3 Matrix{Float64}:
 0.0     0.0  0.0
 0.25   27.0  0.0212934
 0.5   512.0  0.403785
 0.75  729.0  0.574921
 1.0     0.0  0.0

Code 2.2

bar(p, prob, xlabel="proportion water", ylabel="probability", legend=false)

Code 2.3

samp = ["W","L","W","W","W","L","W","L","W"]
W = sum(samp .== "W")
L = sum(samp .== "L")
poss = collect(0:0.25:1) # proportions W
ways = map( x -> (x)^W * (1-x)^L, poss)
post = ways ./ sum(ways)
hcat(poss, ways, post)
5×3 Matrix{Float64}:
 0.0   0.0          0.0
 0.25  0.000102997  0.0212934
 0.5   0.00195312   0.403785
 0.75  0.00278091   0.574921
 1.0   0.0          0.0
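Not in the book: a one-line check, assuming `prob` (Code 2.1) and `post` (Code 2.3) are both still in scope. The factor of 4 sides per possibility is constant in p, so it cancels when normalizing.

# NOT IN BOOK -- 4^(W+L) cancels in the normalization, so both versions agree
prob ≈ post   # expect true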

Code 2.4

function compute_posterior( the_sample , poss )
  W = sum(the_sample .== "W")
  L = sum(the_sample .== "L")
  ways = map( x -> (x)^W * (1-x)^L, poss)
  post = ways ./ sum(ways)
  DataFrame(
    poss=poss, 
    ways=ways, 
    post=round.(post, digits=3))
end
compute_posterior (generic function with 1 method)

Code 2.5

the_sample = ["W","L","W","W","W","L","W","L","W"]
the_possibilities = collect(0:0.25:1)
compute_posterior( the_sample , the_possibilities )
5×3 DataFrame
 Row │ poss     ways         post
     │ Float64  Float64      Float64
─────┼───────────────────────────────
   1 │    0.0   0.0            0.0
   2 │    0.25  0.000102997    0.021
   3 │    0.5   0.00195312     0.404
   4 │    0.75  0.00278091     0.575
   5 │    1.0   0.0            0.0

Code 2.6

collect(range(0,1, length=11))
11-element Vector{Float64}:
 0.0
 0.1
 0.2
 0.3
 0.4
 0.5
 0.6
 0.7
 0.8
 0.9
 1.0

Code 2.7

the_possibilities = collect(range(0,1, length=11))
compute_posterior( the_sample, the_possibilities)
11×3 DataFrame
 Row │ poss     ways         post
     │ Float64  Float64      Float64
─────┼───────────────────────────────
   1 │    0.0   0.0            0.0
   2 │    0.1   7.29e-7        0.0
   3 │    0.2   3.2768e-5      0.003
   4 │    0.3   0.000250047    0.021
   5 │    0.4   0.000884736    0.074
   6 │    0.5   0.00195312     0.164
   7 │    0.6   0.00298598     0.251
   8 │    0.7   0.00317652     0.267
   9 │    0.8   0.00209715     0.176
  10 │    0.9   0.000531441    0.045
  11 │    1.0   0.0            0.0

Code 2.8

# 10-sided die
the_possibilities = collect(range(0, 1 , length=10+1))
post = compute_posterior( the_sample , the_possibilities)
fig2_6_left = bar(
  the_possibilities, post.post, fillcolor=:steelblue2, legend=false, ylim=(0,0.3),
  xlabel="proportion water", ylabel="probability",
  title=string(length(the_possibilities), " possibilities"))

## NOT IN BOOK -- Reproduce Fig 2.6
# 20-sided die
the_possibilities = collect(range(0 , 1, length=20+1))
post = compute_posterior( the_sample , the_possibilities)
fig2_6_right = bar(
  the_possibilities, post.post, fillcolor=:tomato, legend=false, ylim=(0,0.3),
  xlabel="proportion water", ylabel="probability", 
  title=string(length(the_possibilities), " possibilities"))

display(plot(fig2_6_left, fig2_6_right, layout=(1,2), size = (600,300), linecolor=nothing))

Code 2.9

# Analytic posterior density at p: conjugate Beta updating of a flat prior,
# combining the counts in the new sample with those in a vector of prior observations.
function compute_posterior2(p, the_sample, the_prior)
    W = sum(the_sample .== "W")
    L = sum(the_sample .== "L")
    W_prior = sum(the_prior .== "W")
    L_prior = sum(the_prior .== "L")
    pdf.(Beta(W + W_prior + 1, L + L_prior + 1), p)  # broadcast so p can be a grid
end

# Sample data
the_sample = ["W","L","W","W","W","L","W","L","W"]

# Generate a vector of x values
x = 0:0.01:1

# Single Plot (bottom right of fig 2.7)
post = compute_posterior2(x, the_sample, String[])  # String[]: empty prior with a concrete eltype
plot(
      x, post, linewidth=2,
      xlabel="", ylabel="", yticks = [],
      legend=:outertop, legendfontsize=8, 
      legendforegroundcolor=nothing,
      legendbackgroundcolor=nothing,
    )

## NOT IN BOOK -- Reproduce Fig 2.7

# Create a 3x3 array to store individual plots
n = length(the_sample)
plots = Plots.Plot[plot() for i in 1:n]

# iterate through the sample one toss at a time, keeping a vector of prior observations
for i in 1:n
    the_prior = the_sample[1:(i-1)]
    # posterior after tosses 1..i: the new observation combined with the prior observations
    y_post = compute_posterior2(x, [the_sample[i]], the_prior)
    # previous posterior (this panel's dashed prior): tosses 1..(i-1) only
    y_prior = compute_posterior2(x, the_prior, String[])
    label = join(the_sample[1:i], " ")
    col = the_sample[i] == "W" ? :steelblue2 : :tomato
    p = plot(
      x, y_post, color=col, linewidth=2,
      label=label, xlabel="", ylabel="", yticks = [],
      legend=:outertop, legendfontsize=8, 
      legendforegroundcolor=nothing,
      legendbackgroundcolor=nothing,
    )
    plots[i] = plot!(
      p, x, y_prior, color = :black, linestyle = :dash, linewidth=1.5)
end

# Splat (...) `plots` into a single 3x3 grid plot (fig2_7)
fig2_7 = plot(plots..., layout=(3,3), size = (700,700));
display(fig2_7)
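Not in the book: a quick sanity check, a sketch using only objects defined above. The last panel conditions on all nine tosses, so its posterior should coincide with the Beta(6+1, 3+1) density from the single-plot example.

# NOT IN BOOK -- sanity check: the final panel's posterior equals Beta(7, 4)
y_last = compute_posterior2(x, [the_sample[end]], the_sample[1:end-1])
maximum(abs.(y_last .- pdf.(Beta(6 + 1, 3 + 1), x)))   # expect ~0 (floating-point noise)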

Code 2.10

post_samples = rand(Beta(6+1, 3+1), 1000);
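Not in the book: a quick check on the draws. The sample mean should be close to the analytic posterior mean of Beta(6+1, 3+1), which is 7 / (7 + 4) ≈ 0.636.

# NOT IN BOOK -- compare the sample mean with the analytic posterior mean
mean(post_samples), mean(Beta(6 + 1, 3 + 1))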

Code 2.11

density(post_samples, linewidth=3, linecolor=:tomato,
  xlabel="proportion water", label="Density")

# Overlay the beta distribution curve
x_values = 0:0.01:1
y_values = pdf.(Beta(6 + 1, 3 + 1), x_values)
plot!(x_values, y_values, linestyle=:dash, linewidth=3, color=:black,
  label="Beta(7, 4)")

Code 2.12

density(post_samples, linewidth=3, linecolor=:tomato,
  bandwidth=.005, # smaller bandwidth -> more wiggly
  xlabel="proportion water", label="Density", title="Wiggle Wiggle")

# Overlay the beta distribution curve
x_values = 0:0.01:1
y_values = pdf.(Beta(6 + 1, 3 + 1), x_values)
plot!(x_values, y_values, linestyle=:dash, linewidth=3, color=:black,
  label="Beta(7, 4)")

Code 2.13

We first need to define a `precis` function.

It is adapted ("hacked") from `precis.jl` in StatisticalRethinking.jl.

using StatsBase: Histogram
using Statistics: quantile
using PrettyTables: pretty_table

const BARS = collect("▁▂▃▄▅▆▇█")

function unicode_histogram(data, nbins = 12)
    # @show data
    f = fit(Histogram, data, nbins = nbins)  # nbins: more like a guideline than a rule, really
    # scale weights between 1 and 8 (length(BARS)) to fit the indices in BARS
    # eps is needed so indices are in the interval [0, 8) instead of [0, 8] which could
    # result in indices 0:8 which breaks things
    scaled = f.weights .* (length(BARS) / maximum(f.weights) - eps())
    indices = floor.(Int, scaled) .+ 1
    return join((BARS[i] for i in indices))
end

function precis(x; digits = 4, a = 0.11)
    d = DataFrame()
    df = isa(x, DataFrame) ? x : DataFrame( x = x )
    cols = collect.(skipmissing.(eachcol(df)))
    d.param = names(df)
    d.mean = mean.(cols)
    d.std = std.(cols)
    quants = quantile.(cols, ([a/2, 0.5, 1-a/2], ))
    quants = hcat(quants...)
    d[:, "5.5%"] = quants[1,:]
    d[:, "50%"] = quants[2,:]
    d[:, "94.5%"] = quants[3,:]
    d.histogram = unicode_histogram.(cols, min(size(df, 1), 12))

    for col in ["mean", "std", "5.5%", "50%", "94.5%"]
        d[:, col] .= round.(d[:, col], digits = digits)
    end

    pretty_table(d)  # pretty_table prints the table itself; no need to wrap in display
end
precis (generic function with 1 method)
precis( post_samples )
┌────────┬─────────┬─────────┬─────────┬─────────┬─────────┬───────────┐
│  param │    mean │     std │    5.5% │     50% │   94.5% │ histogram │
│ String │ Float64 │ Float64 │ Float64 │ Float64 │ Float64 │    String │
├────────┼─────────┼─────────┼─────────┼─────────┼─────────┼───────────┤
│      x │  0.6344 │  0.1407 │  0.3994 │  0.6454 │  0.8418 │ ▁▁▂▄▆█▇▄▁ │
└────────┴─────────┴─────────┴─────────┴─────────┴─────────┴───────────┘
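Not in the book: the 5.5%, 50%, and 94.5% columns are just sample quantiles (percentile intervals), so as a sketch they can be compared with the analytic quantiles of the Beta(6+1, 3+1) posterior.

# NOT IN BOOK -- analytic quantiles for comparison with the precis columns above
quantile.(Beta(6 + 1, 3 + 1), [0.055, 0.5, 0.945])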

Code 2.14

density(post_samples, linewidth=3, linecolor=:tomato,
  bandwidth=.005, # smaller bandwidth -> more wiggly
  xlabel="proportion water", label="Density")

# Overlay the beta distribution curve
x_values = 0:0.01:1
y_values = pdf.(Beta(6 + 1, 3 + 1), x_values)
plot!(x_values, y_values, linestyle=:dash, linewidth=3, color=:black,
  label="Beta(7, 4)")
pp = [.4, .64, .84]
bb = pdf.(Beta(7,4), pp)
bar!(pp, bb, bar_width=.005, fillcolor=:black, linecolor=nothing)

Code 2.15

sample(["W", "L"], AnalyticWeights([.7,.3]), 9, replace=true)
9-element Vector{String}:
 "W"
 "W"
 "W"
 "W"
 "L"
 "W"
 "W"
 "W"
 "W"

Code 2.16

function sim_globe(p = 0.7, N = 9)
  sample(["W", "L"], AnalyticWeights([p,1-p]), N, replace=true)
end
sim_globe (generic function with 3 methods)
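Not in the book: before using the simulator, it helps to test it at settings where the answer is known; the tolerances here are informal.

# NOT IN BOOK -- simple tests of sim_globe
sim_globe(1.0, 10)                                 # expect all "W"
sum(sim_globe(0.5, 100_000) .== "W") / 100_000     # expect ≈ 0.5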

Code 2.17

sim_globe(0.1, 10)
10-element Vector{String}:
 "L"
 "L"
 "L"
 "W"
 "L"
 "L"
 "L"
 "L"
 "L"
 "L"

Code 2.18

pred_01 = [ sum(sim_globe(0.1, 10) .== "W") for _ in 1:10000 ];

Code 2.19

bar( sort( countmap( pred_01 ) ), linecolor=nothing, bar_width=.25,
     xlabel="number of W", ylabel="count", title="p=0.1")
# NOT IN BOOK -- Add additional Plots
pred_25 = [ sum(sim_globe(0.25, 10) .== "W") for _ in 1:10000 ]
pred_60 = [ sum(sim_globe(0.6, 10) .== "W") for _ in 1:10000 ]
plot1 = bar( sort( countmap( pred_25 ) ), 
      linecolor=nothing, bar_width=.25, title="p=0.25")
plot2 = bar( sort( countmap( pred_60 ) ), 
      linecolor=nothing, bar_width=.25, title="p=0.6")
plot(plot1, plot2, layout=(1,2), xlabel="number of W", ylabel="count" )
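Not in the book: with a fixed p, the number of "W" in 10 tosses is exactly Binomial(10, p), so the simulated frequencies can be checked against the analytic pmf (a sketch; `counts` is from StatsBase, `Binomial` from Distributions).

# NOT IN BOOK -- compare simulated frequencies (p = 0.1) with the Binomial(10, 0.1) pmf
sim_freq = counts(pred_01, 0:10) ./ length(pred_01)
exact = pdf.(Binomial(10, 0.1), 0:10)
hcat(0:10, round.(sim_freq, digits=4), round.(exact, digits=4))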

Code 2.20

p = .64
pred_64 = [ sum(sim_globe(0.64, 10) .== "W") for _ in 1:10000 ];
bar( sort( countmap( pred_64 ) ), fillcolor=:black, linecolor=nothing, 
     bar_width=.25, xlabel="number of W", ylabel="count", title="p=0.64")

post_samples = rand(Beta(6+1, 3+1), 10000);
pred_post = [ sum( sim_globe(p, 10) .== "W" ) for p in post_samples];
tab_post = sort(countmap(pred_post));
bar!( tab_post, fillcolor=:steelblue2, linecolor=nothing, bar_width=.125)
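Not in the book: averaging the Binomial sampling distribution over Beta(6+1, 3+1) posterior draws yields the Beta-Binomial posterior predictive, so the simulated counts can be checked against `BetaBinomial(10, 7, 4)` from Distributions (a sketch).

# NOT IN BOOK -- compare the simulated posterior predictive with BetaBinomial(10, 7, 4)
betabin = BetaBinomial(10, 6 + 1, 3 + 1)
hcat(0:10,
     round.(counts(pred_post, 0:10) ./ length(pred_post), digits=4),
     round.(pdf.(betabin, 0:10), digits=4))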

Code 2.21

function sim_globe2( p=0.7, N=9 , x=0.1 )
  true_sample = sample(["W", "L"], AnalyticWeights([p,1-p]), N, replace=true)
  obs_sample = ifelse.(
    rand(Uniform(), N) .< x ,
    ifelse.( true_sample .== "W" , ["L"] , ["W"]),
    true_sample)
  obs_sample
end
sim_globe2 (generic function with 4 methods)
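Not in the book: a quick sketch to verify the error process. With p = 1 every true toss is "W", so the share of "L" in the observed sample estimates the misclassification rate x.

# NOT IN BOOK -- check that the observed error rate matches x
obs = sim_globe2(1.0, 100_000, 0.1)
sum(obs .== "L") / length(obs)   # expect ≈ 0.1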

Code 2.22

# code for the normalizing constant
function ibeta( x , a , b ) 
  # (R => julia)
  ## pbeta(x, a, b) => cdf(Beta(a,b), x)
  ## pbeta(x, a, b, log.p = TRUE) => log(cdf(Beta(a,b), x))
  ## lbeta(a,b) => logbeta(a,b)
  exp( log(cdf(Beta(a, b), x)) + logbeta(a,b) )
end

function Z( x, W , L )
  ( ibeta( 1-x , L+1, W+1 ) - ibeta( x , L+1, W+1 ) ) / ( 1-2*x )
end

# data
W = 6
L = 3

x_values = 0:0.01:1
y_values = pdf.(Beta(W + 1, L + 1), x_values)
plot(x_values, y_values, linewidth=3, color=:black,
  label="Beta(7, 4)", xlabel = "proportion of water",
  ylabel="posterior probability")
# new misclassification posterior
xe = 0.1
# Julia has no `curve` function like R's, so define the density and broadcast it over the grid
function misclass_post( x , xe , W , L )
  (1/Z(xe,W,L)) * (x*(1-xe)+(1-x)*xe)^W * ((1-x)*(1-xe)+x*xe)^L
end
y_values = misclass_post.(x_values, xe, W, L)
plot!(x_values, y_values, linewidth=3, color=:red)
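Not in the book: a sketch confirming that Z really is the normalizing constant; a crude Riemann sum of the misclassification posterior over the grid should be close to 1.

# NOT IN BOOK -- the misclassification posterior should integrate to ~1
sum(misclass_post.(x_values, xe, W, L)) * 0.01   # expect ≈ 1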

Code 2.23

function misclass_post( x , xe , W , L )
  (1/Z(xe,W,L)) * (x*(1-xe)+(1-x)*xe)^W * ((1-x)*(1-xe)+x*xe)^L
end

# port function `rethinking::grau()`
function grau(alpha=0.5)
  # http://juliagraphics.github.io/Colors.jl/stable/constructionandconversion/#Color-Parsing
  coloralpha(colorant"black", alpha)
end

function plot_sim( p , N, xe )
  sim_sample = sim_globe2( p , N , xe);
  W = sum(sim_sample .== "W")
  L = sum(sim_sample .== "L")
  #
  x_values = 0:0.01:1
  y_values = pdf.(Beta(W + 1, L + 1), x_values)
  #
  plot(x_values, y_values, linewidth=2, color=:black)
  # new misclassification posterior
  y_values = misclass_post.(x_values, xe, W, L)
  vline!([0.7], color=grau(), linewidth=1)
  plot!(x_values, y_values, linewidth=2, color=:red)
  xlims!(.5, .85) # zoom in to make the subplots cleaner
end

# Repeat the simulation several times and observe how the new posterior,
## which accounts for misclassification, is consistently closer to the true p.
plots = Plots.Plot[plot() for i in 1:9];

# repeat the simulation for each of the nine panels
p = 0.7
N = Int(1e3)
xe = 0.2
for i in eachindex(plots)
    plots[i] = plot_sim(p,N,xe)
    if i == 1
      plot!(xlabel = "proportion of water", ylabel="posterior probability")
    elseif mod(i-1,3) != 0
      plot!(yticks = [])
    end
end

# Splat (...) `plots` into a single 3x3 grid plot
plot(plots..., layout=(3,3), size = (700,700), plot_title = "Ignores Misclassification (black) vs Accounts for It (red)", plot_titlefontsize=10)